#pragma once
/// @brief Declaration of the CPU transpose routine (no definition in this header).
///
/// Only the signature is visible here, so the notes below are assumptions to
/// confirm against the definition:
///  - presumably writes the transpose of the matrix in @p h_in to @p h_out;
///  - the `h_` prefix presumably marks host-memory buffers;
///  - which of @p m / @p n is the row count vs. the column count, and the
///    storage order, are not visible here.
///
/// NOTE(review): this template is declared but not defined in this header, so
/// callers depend on a definition or explicit instantiation supplied elsewhere.
///
/// @tparam T     Element type of both buffers.
/// @param h_out  Destination buffer.
/// @param h_in   Source buffer (non-const in the signature; whether it is
///               modified cannot be determined from here).
/// @param m      First matrix dimension.
/// @param n      Second matrix dimension.
template <class T>
void transpose_cpu(
    T *h_out,
    T *h_in,
    const unsigned int m,
    const unsigned int n);
/// @brief Declaration of the CUDA transpose routine (no definition in this header).
///
/// Only the signature is visible here, so the notes below are assumptions to
/// confirm against the definition:
///  - presumably writes the transpose of the matrix in @p h_in to @p h_out,
///    using the GPU for the computation;
///  - the `h_` prefix presumably marks host-memory buffers, which would imply
///    that any device allocation and copies happen inside the implementation;
///  - which of @p m / @p n is the row count vs. the column count, and the
///    storage order, are not visible here.
///
/// NOTE(review): this template is declared but not defined in this header, so
/// callers depend on a definition or explicit instantiation supplied elsewhere.
///
/// @tparam T     Element type of both buffers.
/// @param h_out  Destination buffer.
/// @param h_in   Source buffer (non-const in the signature; whether it is
///               modified cannot be determined from here).
/// @param m      First matrix dimension.
/// @param n      Second matrix dimension.
template <class T>
void transpose_cuda(
    T *h_out,
    T *h_in,
    const unsigned int m,
    const unsigned int n);
